library(tidyverse)
library(janitor)
library(lubridate)
library(ggplot2)
library(e1071)
library(modelr)
library(dplyr)
# Load the raw Goodreads export and standardise the headers to
# snake_case with janitor::clean_names().
books <- clean_names(read.csv("books.csv"))

# How big is the raw dataset?
(dim(books))
[1] 11131 12
# Keep only books that actually have ratings: drop rows where
# ratings_count is missing or zero. Also remove rows where the
# average_rating column holds author-name fragments (artefacts of
# malformed CSV rows) instead of a numeric rating.
books_cleaned <- books %>%
  filter(!is.na(ratings_count), ratings_count != 0) %>%
  filter(!average_rating %in% c(
    " Jr./Sam B. Warner",
    " one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)",
    " Rawles",
    " Son & Ferguson"
  ))
The dataset has 11,131 rows and 12 variables. We are not concerned with books that have no ratings or books that have a rating count of 0. These have been removed.
A number of entries in the “average rating” category are strings of text. These are also removed; we now have 11,043 rows in the dataset.
The 12 variables in the dataset are:
# List the variables remaining after cleaning.
(names(books_cleaned))
[1] "book_id" "title" "authors"
[4] "average_rating" "isbn" "isbn13"
[7] "language_code" "num_pages" "ratings_count"
[10] "text_reviews_count" "publication_date" "publisher"
book_id, isbn and isbn13 are all unique identifiers for each book. We will remove book_id and isbn and use isbn13 as the most appropriate unique identifier.
num_pages has several entries with ‘0’. This calls into question the reliability of this variable, so it is removed from our dataset.
# Drop the redundant identifiers (isbn13 is kept as the unique key)
# and the unreliable num_pages column, then inspect the result.
books_cleaned <- books_cleaned %>%
  select(-book_id, -isbn, -num_pages)

(glimpse(books_cleaned))
Rows: 11,043
Columns: 9
$ title <chr> "Harry Potter and the Half-Blood Prince (Harr…
$ authors <chr> "J.K. Rowling/Mary GrandPré", "J.K. Rowling/M…
$ average_rating <chr> "4.57", "4.49", "4.42", "4.56", "4.78", "3.74…
$ isbn13 <chr> "9780439785969", "9780439358071", "9780439554…
$ language_code <chr> "eng", "eng", "eng", "eng", "eng", "en-US", "…
$ ratings_count <int> 2095690, 2153167, 6333, 2339585, 41428, 19, 2…
$ text_reviews_count <int> 27591, 29221, 244, 36325, 164, 1, 808, 254, 4…
$ publication_date <chr> "9/16/2006", "9/1/2004", "11/1/2003", "5/1/20…
$ publisher <chr> "Scholastic Inc.", "Scholastic Inc.", "Schola…
This leaves us with 9 variables.
We now tidy these variables to ensure they are of the correct data type for further analysis.
# Coerce columns to appropriate types: average_rating to numeric,
# language_code to a factor, publication_date parsed as month/day/year.
# Impossible dates (e.g. 31st June) parse to NA and those rows are
# dropped.
books_cleaned <- books_cleaned %>%
  mutate(
    average_rating   = as.double(average_rating),
    language_code    = as.factor(language_code),
    publication_date = mdy(publication_date)
  ) %>%
  filter(!is.na(publication_date))
Two entries have incorrect publication dates (31st June and 31st November); given the unreliability of the data, these entries have also been removed.
# The ten earliest-published books in the dataset.
ten_oldest_books <- books_cleaned %>%
  arrange(publication_date) %>%
  slice_head(n = 10)

# Bar chart of their average ratings over publication date.
(plot_ten_oldest <- ggplot(ten_oldest_books,
                           aes(publication_date, average_rating)) +
   geom_col())
The ten oldest books in the dataset were published between 1900 and 1928. Number of ratings range from 21-332 and average ratings range from 3.91 - 4.35.
# The ten most recently published books in the dataset.
ten_newest_books <- books_cleaned %>%
  arrange(desc(publication_date)) %>%
  slice_head(n = 10)

# Bar chart of their average ratings over publication date.
(plot_ten_newest <- ggplot(ten_newest_books,
                           aes(publication_date, average_rating)) +
   geom_col())
The 10 newest books in the dataset were published between July 2018 and March 2020. Rating count ranges from 9-56171 and average ratings range from 3.43 to 4.50.
So far the data is showing that there is a narrow range in average rating, regardless of when the book was published or the ratings count.
If we round the average rating and compare the number of each rounded rating we get:
# Add an integer rating column (rounded to the nearest whole star),
# placed directly after the raw average.
books_cleaned_average <- books_cleaned %>%
  mutate(rounded_rating = round(average_rating), .after = average_rating)

# How many books fall into each rounded-rating band, and what share of
# the total is that?
(average_rating_summary <- books_cleaned_average %>%
   count(rounded_rating, name = "rounded_rating_count") %>%
   mutate(percentage = round(rounded_rating_count /
                               sum(rounded_rating_count) * 100, 2)))
Almost 92% of books have an average rating of 4. So what does it take to get a below average rating (1-3) or an above average rating (5)?
# Books whose rounded rating is 2 or lower.
(rating_two_or_one <- filter(books_cleaned_average, rounded_rating <= 2))
With the exception of “Citizen Girl”, all of the books with a rating of 2 or 1 have a ratings count of <5. Citizen Girl has a ratings count of 5415.
This suggests that there is a potential pattern in the number of reviews and the overall average rating. This is explored further by looking at the rating count ranges for each of the average score categories.
# For each rounded-rating band, find the smallest and largest number of
# ratings any book in that band received.
(rating_count_ranges <- books_cleaned_average %>%
   group_by(rounded_rating) %>%
   summarise(
     min_review_count = min(ratings_count),
     max_review_count = max(ratings_count)
   ))
NA
NA
# Distribution of ratings_count within each rounded-rating band.
(ratings_boxplot <- books_cleaned_average %>%
   mutate(rounded_rating = factor(rounded_rating)) %>%
   ggplot(aes(rounded_rating, ratings_count)) +
   geom_boxplot())
Due to the range of results it is impossible to infer anything specific from these boxplots; however, there appears to be a few outliers that are perhaps skewing our data. This leads to the question, is there a pattern to what makes more people read and review a book?
We can break our data down to look at the top 5% most reviewed books.
# Flag books whose ratings_count sits above the 95th percentile and
# keep only those (the top 5% most-reviewed books).
top_5_percent_ratings_count <- books_cleaned_average %>%
  mutate(top_5_percent_ratings_count =
           percent_rank(ratings_count) > 0.95) %>%
  filter(top_5_percent_ratings_count)
Summary of the top 5% most reviewed books (552 books):
# Column-by-column summary of the top 5% most-reviewed books.
(summary(top_5_percent_ratings_count))
title authors average_rating rounded_rating
Length:552 Length:552 Min. :3.130 Min. :3.000
Class :character Class :character 1st Qu.:3.877 1st Qu.:4.000
Mode :character Mode :character Median :4.030 Median :4.000
Mean :4.015 Mean :3.989
3rd Qu.:4.170 3rd Qu.:4.000
Max. :4.590 Max. :5.000
isbn13 language_code ratings_count text_reviews_count
Length:552 eng :531 Min. : 61639 Min. : 109
Class :character en-US : 13 1st Qu.: 84114 1st Qu.: 2356
Mode :character spa : 4 Median : 128474 Median : 4014
en-GB : 2 Mean : 273958 Mean : 6946
fre : 2 3rd Qu.: 248678 3rd Qu.: 7563
ale : 0 Max. :4597666 Max. :94265
(Other): 0
publication_date publisher top_5_percent_ratings_count
Min. :1952-12-01 Length:552 Mode:logical
1st Qu.:2001-06-12 Class :character TRUE:552
Median :2004-01-30 Mode :character
Mean :2002-12-03
3rd Qu.:2006-01-17
Max. :2014-07-29
Perhaps one area to explore is whether there are publishers that get better reviews than others.
# Unique publishers among the most-reviewed books, alphabetically.
(arrange(distinct(top_5_percent_ratings_count, publisher), publisher))
It looks like there is inconsistent naming of publishers, or often one publishing house has multiple divisions that they publish under. This means looking at publishing info will require a lot of tidying and is likely not worth it.
Let’s look at books that have been published in multiple languages.
# Keep every row whose title occurs more than once in the data
# (title %in% title[duplicated(title)] marks ALL occurrences of a
# repeated title, not just the second onward).
published_more_than_once <- books_cleaned_average %>%
  filter(title %in% title[duplicated(title)]) %>%
  arrange(title)

# The distinct repeated titles.
(distinct(published_more_than_once, title))
How many languages are represented in the books published more than once?
# Which language codes appear among the repeated titles?
(distinct(published_more_than_once, language_code))
It looks like 9 languages, but 4 of these are English. We will group “eng”, “en-CA”, “en-GB” and “en-US” together, to be left with 6 distinct languages: - Aleut - English - French - German - Greek - Spanish
# Collapse the English variants (en-CA, en-GB, en-US) into the single
# "eng" code, then compute the mean average rating per
# (title, language) pair. `.groups = "drop_last"` makes the default
# grouping explicit (the result stays grouped by title, which the
# later duplicated()-based filter relies on) and silences the
# summarise() message seen in the transcript.
published_more_than_once <- published_more_than_once %>%
  mutate(language_code = case_when(
    language_code %in% c("en-CA", "en-GB", "en-US") ~ "eng",
    .default = language_code)
  ) %>%
  group_by(title, language_code) %>%
  summarise(average_rating = mean(average_rating), .groups = "drop_last")
`summarise()` has grouped output by 'title'. You can override using the `.groups` argument.
Using our grouped data, we can look at books that have been published in multiple languages:
# Of the per-(title, language) averages, keep the titles that still
# appear under more than one language code.
published_multiple_languages <- published_more_than_once %>%
  filter(title %in% title[duplicated(title)]) %>%
  arrange(title)
We find that there are 19 books published in multiple languages:
# The titles published in more than one language.
(distinct(published_multiple_languages, title))
Do books do better in different languages, or do they score roughly the same?
# Reshape to one row per title with one rating column per language.
(mult_languages_pivot <- pivot_wider(published_multiple_languages,
                                     names_from = language_code,
                                     values_from = average_rating))
NA
# Colour-blind-friendly palette for the language fill scale.
cbb_palette_language <- c("#56B4E9", "#0072B2", "#D55E00", "#E69F00",
                          "darkgreen", "#009E73")

# Dodged bar chart: one bar per language within each title.
published_multiple_languages %>%
  ggplot(aes(title, average_rating, fill = language_code)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = cbb_palette_language) +
  labs(title = "Books published in multiple languages",
       subtitle = "Comparing ratings across languages") +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 6))
Jane Eyre averaged slightly lower in English than in German (4.11 to 4.12).
Trainspotting scored slightly lower in English than in French (4.03 to 4.09).
Let’s add the variations on English back in to see if this changes anything:
# Repeat the per-language averaging but WITHOUT collapsing the English
# variants, keeping titles that still appear under multiple codes.
published_more_than_once_english <- books_cleaned_average %>%
  filter(title %in% title[duplicated(title)]) %>%
  arrange(title) %>%
  group_by(title, language_code) %>%
  summarise(average_rating = mean(average_rating)) %>%
  filter(title %in% title[duplicated(title)])
`summarise()` has grouped output by 'title'. You can override using the `.groups` argument.
# One row per title, one rating column per (un-collapsed) language code.
(mult_languages_pivot_eng <- pivot_wider(published_more_than_once_english,
                                         names_from = language_code,
                                         values_from = average_rating))
This hasn’t been looked at in any detail yet. It appears that there are minor variations, but nothing that stands out. It might be worth exploring this further. This data could perhaps let publishers and authors get an idea of which languages might be worth publishing in, or whether readers of one variation of English tend to give better reviews.
Looking at the average rating per language for all books:
# Book count and mean average rating per language code, for all books.
(language_avg <- books_cleaned_average %>%
   group_by(language_code) %>%
   summarise(
     count = n(),
     mean_avg = mean(average_rating)
   ))
We need to remove languages with the fewest reviews; suggest keeping only languages with >20 books.
# Mean rating per language, restricted to languages represented by
# more than 20 books (to avoid tiny-sample noise).
(language_avg_20 <- books_cleaned_average %>%
   group_by(language_code) %>%
   filter(n() > 20) %>%
   summarise(
     count = n(),
     mean_avg = mean(average_rating)
   ))
# Bar chart of mean rating per language (>20 books only).
language_avg_20 %>%
  ggplot(aes(language_code, mean_avg, fill = language_code)) +
  geom_col() +
  theme_bw() +
  theme(text = element_text(size = 10),
        axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
        axis.text.y = element_text(size = 6))
What is the average across all books?
# Overall mean of average_rating across every book.
(avg_all_books <- summarise(books_cleaned_average,
                            mean_avg = mean(average_rating)))
# Classify each language as rating above or below the overall mean.
# Use the computed overall mean (avg_all_books, defined above) rather
# than the hand-transcribed constant 3.943 — the hard-coded value would
# silently drift out of date if the data changed.
(language_avg_20_above_below <- language_avg_20 %>%
   mutate(mean = avg_all_books$mean_avg,
          above_or_below = if_else(mean_avg > mean, "above", "below"))
)
# Palette: green = above the overall mean, red = below.
above_below_palette <- c("darkgreen", "darkred")

# Compare the remaining languages, excluding the two dominant English
# codes. The original `language_code != c("eng", "en-US")` recycled a
# two-element vector element-wise against the column (triggering the
# recycling warning in the transcript and dropping the wrong rows);
# %in% is the correct set-membership test.
language_avg_20_above_below %>%
  filter(!language_code %in% c("eng", "en-US")) %>%
  ggplot(aes(x = language_code, y = mean_avg, fill = above_or_below)) +
  geom_col() +
  scale_fill_manual(values = above_below_palette) +
  geom_label(aes(label = round(mean_avg, 2))) +
  theme_bw()
Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: `language_code != c("eng", "en-US")`.
Caused by warning in `!=.default`:
! longer object length is not a multiple of shorter object length
ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 1 remaining warning.
Maybe there is something here? Take out en-US and eng to leave us with a more comfortable range of review numbers?
# Rebuild the cleaned dataset from the raw data, this time RETAINING
# num_pages so page counts can be analysed.
books_cleaned_v2 <- books %>%
  filter(!is.na(ratings_count), ratings_count != 0) %>%
  filter(!average_rating %in% c(
    " Jr./Sam B. Warner",
    " one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)",
    " Rawles",
    " Son & Ferguson"
  )) %>%
  select(-book_id, -isbn) %>%
  mutate(
    num_pages = as.integer(num_pages),
    ratings_count = as.integer(ratings_count)
  ) %>%
  arrange(num_pages)
# Scatter of page count against number of ratings.
ggplot(books_cleaned_v2, aes(num_pages, ratings_count)) +
  geom_point() +
  labs(x = "Number of Pages", y = "Number of Ratings")
# How many books report fewer than 100 pages?
books_cleaned_v2 %>%
  filter(num_pages < 100)

# Books with a page count of exactly zero.
books_cleaned_v2 %>%
  filter(num_pages == 0)

# Books reporting between 1 and 9 pages.
books_cleaned_v2 %>%
  filter(num_pages > 0,
         num_pages < 10)
NA
1,010 books have <100 pages, 75 of these have a page count of 0 and 116 have a page count of between 1 & 9. Given the unlikelihood of this many books being this short, it is clear that there are errors in this variable.
There is no known appropriate number for fewest pages a book may have. To try to eliminate any incorrect data, we remove the lowest 5% of num_pages.
Our plot above also shows that there are some outlier books that have a high number of pages. There appears to be a steady range of books with fewer than 1000 pages.
# Inspect books reporting more than 1000 pages (likely collections).
books_cleaned_v2 %>%
  filter(num_pages > 1000)
NA
There are 215 books with >1000 pages. Some of these appear to be collections of books, e.g. “The Border Trilogy”. For data consistency, we will also remove the top 5% of num_pages.
# Removing the bottom and top 5% of page count. The data is already
# sorted by num_pages (arrange() above), so positional trimming removes
# the extreme page counts at both ends.
num_rows <- nrow(books_cleaned_v2)
bottom_percentage <- 0.05
top_percentage <- 0.95
bottom_rows_to_remove <- round(num_rows * bottom_percentage)
top_rows_to_remove <- round(num_rows * (1 - top_percentage))

# Keep only the middle 90% of rows by position.
trimmed_books_v1 <- books_cleaned_v2 %>%
  slice((bottom_rows_to_remove + 1):(num_rows - top_rows_to_remove))
# Re-plot page count vs ratings count after trimming page extremes.
ggplot(trimmed_books_v1, aes(num_pages, ratings_count)) +
  geom_point() +
  labs(x = "Number of Pages", y = "Number of Ratings")
# Sort by ratings_count so the same positional trim can be applied to
# that variable.
trimmed_books_v2 <- trimmed_books_v1 %>%
  arrange(ratings_count)

# Removing the bottom and top 5% of ratings count.
num_rows <- nrow(trimmed_books_v2)
bottom <- 0.05
top <- 0.95
bottom_remove <- round(num_rows * bottom)
top_remove <- round(num_rows * (1 - top))

# Keep the middle 90% of rows by position.
trimmed_books_v2 <- trimmed_books_v2 %>%
  slice((bottom_remove + 1):(num_rows - top_remove))

# Page count vs ratings count after both trims.
ggplot(trimmed_books_v2, aes(num_pages, ratings_count)) +
  geom_point() +
  labs(x = "Number of Pages", y = "Number of Ratings")
# Simple linear model: ratings_count as a function of num_pages.
# The printed summary below shows R-squared ~0.008, i.e. page count
# explains almost none of the variance in ratings count.
model <- lm(ratings_count ~ num_pages, data = trimmed_books_v2)
summary(model)
Call:
lm(formula = ratings_count ~ num_pages, data = trimmed_books_v2)
Residuals:
Min 1Q Median 3Q Max
-7850 -4785 -3818 -631 57553
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3147.9293 261.0676 12.058 <2e-16 ***
num_pages 6.3403 0.7488 8.467 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10340 on 8943 degrees of freedom
Multiple R-squared: 0.007953, Adjusted R-squared: 0.007842
F-statistic: 71.69 on 1 and 8943 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots (residuals, Q-Q, leverage, ...).
plot(model)

# Attach the model's fitted values to the data.
data1 <- add_predictions(trimmed_books_v2, model)

# Raw scatter with the regression line overlaid in red.
ggplot(data1, aes(x = num_pages)) +
  geom_point(aes(y = ratings_count)) +
  geom_line(aes(y = pred), col = "red")
# Skewness (e1071, type 1 estimator) of the ratings count distribution.
summarise(trimmed_books_v2,
          skewness = skewness(ratings_count, type = 1))
The data is highly right skewed. In this case we should either look at the median as our stat, or look at standardising the data.
summary(trimmed_books_v2)
title authors average_rating isbn13 language_code num_pages
Length:8945 Length:8945 Length:8945 Length:8945 Length:8945 Min. : 51.0
Class :character Class :character Class :character Class :character Class :character 1st Qu.:208.0
Mode :character Mode :character Mode :character Mode :character Mode :character Median :300.0
Mean :316.6
3rd Qu.:400.0
Max. :750.0
ratings_count text_reviews_count publication_date publisher
Min. : 10 Min. : 0.0 Length:8945 Length:8945
1st Qu.: 165 1st Qu.: 12.0 Class :character Class :character
Median : 859 Median : 52.0 Mode :character Mode :character
Mean : 5155 Mean : 234.3
3rd Qu.: 4414 3rd Qu.: 214.0
Max. :62954 Max. :5699.0
num_pages now ranges from 51-750.
Use this to add book range categories:
# Bin num_pages into 100-page ranges. After trimming, num_pages runs
# from 51 to 750 (see the summary above), so breaks at 50, 150, ...,
# 750 cover every row. cut() replaces the original case_when() +
# factor() pair: it yields a factor with the levels already in order,
# and — unlike the original, which labelled ANY value <= 150 (including
# 0-50) as "51-150" — returns NA for values outside 51-750.
trimmed_books_with_range <- trimmed_books_v2 %>%
  mutate(page_range = cut(
    num_pages,
    breaks = seq(50, 750, by = 100),
    labels = c("51-150",
               "151-250",
               "251-350",
               "351-450",
               "451-550",
               "551-650",
               "651-750")
  ))
# Draw up to 100 books per page range (seed fixed for reproducibility)
# so the ranges can be compared on equal footing.
set.seed(42)
sampled_trimmed_books_with_range <- trimmed_books_with_range %>%
  group_by(page_range) %>%
  slice_sample(n = 100)

# Total ratings count per page range in the sample.
sampled_trimmed_books_with_range %>%
  ggplot(aes(page_range, ratings_count)) +
  geom_col()

# Median ratings count and sample size per page range.
(trimmed_range_summary <- sampled_trimmed_books_with_range %>%
   group_by(page_range) %>%
   summarise(
     median_ratings_count = median(ratings_count),
     num_books = n()
   ))
# Log-transform ratings_count to tame the heavy right skew.
trimmed_books_with_range_normalised <- trimmed_books_with_range %>%
  mutate(log_ratings_count = log(ratings_count))

# Total log ratings count per page range.
trimmed_books_with_range_normalised %>%
  ggplot(aes(page_range, log_ratings_count)) +
  geom_col()

# Mean, median and book count of the log-transformed variable, per
# page range.
(trimmed_range_normalised_summary <- trimmed_books_with_range_normalised %>%
   group_by(page_range) %>%
   summarise(
     mean = mean(log_ratings_count),
     median = median(log_ratings_count),
     num_books = n()
   ))